{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "## This file performs alternative random forest specifications"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Intel(R) Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from pandas.tseries.offsets import *\n",
    "from tqdm import tqdm\n",
    "from functools import reduce\n",
    "import statsmodels.api as sm\n",
    "# import scipy.stats as stats\n",
    "from linearmodels import PanelOLS\n",
    "import os\n",
    "from sklearnex import patch_sklearn\n",
    "patch_sklearn()\n",
    "\n",
    "from sklearn.model_selection import ParameterGrid\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "\n",
    "from functions import utils\n",
    "from functions import summary2\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\", category=DeprecationWarning)\n",
    "\n",
    "plt.rcParams['font.sans-serif']=['Times New Roman']\n",
    "plt.rcParams.update({'font.size':13})\n",
    "plt.rcParams['xtick.direction'] = 'in'\n",
    "plt.rcParams['ytick.direction'] = 'in'\n",
    "plt.rcParams['grid.color'] = 'gray'\n",
    "plt.rcParams['grid.linestyle'] = '--'\n",
    "%config InlineBackend.figure_format = 'retina'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ratio_chars = ['CAPEI', 'bm',\n",
    "       'evm', 'pe_exi', 'pe_inc', 'ps', 'pcf',\n",
    "       'dpr', 'npm', 'opmbd', 'opmad', 'gpm', 'ptpm', 'cfm', 'roa', 'roe',\n",
    "       'roce', 'efftax', 'aftret_eq', 'aftret_invcapx', 'aftret_equity',\n",
    "       'pretret_noa', 'pretret_earnat', 'GProf', 'equity_invcap',\n",
    "       'debt_invcap', 'totdebt_invcap', 'capital_ratio', 'int_debt',\n",
    "       'int_totdebt', 'cash_lt', 'invt_act', 'rect_act', 'debt_at',\n",
    "       'debt_ebitda', 'short_debt', 'curr_debt', 'lt_debt', 'profit_lct',\n",
    "       'ocf_lct', 'cash_debt', 'fcf_ocf', 'lt_ppent', 'dltt_be', 'debt_assets',\n",
    "       'debt_capital', 'de_ratio', 'intcov', 'intcov_ratio', 'cash_ratio',\n",
    "       'quick_ratio', 'curr_ratio', 'cash_conversion', 'inv_turn', 'at_turn',\n",
    "       'rect_turn', 'pay_turn', 'sale_invcap', 'sale_equity', 'sale_nwc',\n",
    "       'rd_sale', 'adv_sale', 'staff_sale', 'accrual', 'ptb', 'PEG_trailing',\n",
    "       'divyield']\n",
    "\n",
    "per_share_chars = ['dividend_p','BE_p','Liability_p','cur_liability_p','LT_debt_p',\n",
    "                  'cash_p', 'total_asset_p', 'tot_debt_p', 'accrual_p', 'EBIT_p',\n",
    "                   'cur_asset_p', 'pbda_p', 'ocf_p', 'inventory_p', 'receivables_p',\n",
    "                   'Cur_debt_p', 'interest_p', 'fcf_ocf_p', 'evm_p',\n",
    "                   'sales_p', 'invcap_p', 'c_equity_p', 'rd_p', 'opmad_p', 'gpm_p','ptpm_p'\n",
    "                  ]\n",
    "\n",
    "macro_chars = ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "\n",
    "fundamental_chars = ['ret', 'prc',\n",
    "                    'EPS_true_l1_q1','EPS_true_l1_q2','EPS_true_l1_q3',\n",
    "                    'EPS_true_l1_y1','EPS_true_l1_y2',\n",
    "                    ]\n",
    "\n",
    "analyst_chars = ['EPS_ana_q1','EPS_ana_q2','EPS_ana_q3','EPS_ana_y1','EPS_ana_y2']\n",
    "\n",
    "targets = ['EPS_true_q1', 'EPS_true_q2', 'EPS_true_q3', 'EPS_true_y1', 'EPS_true_y2']\n",
    "\n",
    "df_tmp = pd.read_parquet('../data/Results/df_train_new.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [13:02<00:00,  1.92s/it]\n"
     ]
    }
   ],
   "source": [
    "#############################\n",
    "### Sample Fraction: 0.01 ###\n",
    "#############################\n",
    "time_idx = sorted(df_tmp['YearMonth'].unique())\n",
    "time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]\n",
    "max_samples = 0.01\n",
    "window = 12\n",
    "n_jobs = 12\n",
    "num_trees = 2000\n",
    "forecast = []\n",
    "for t in tqdm(time_idx):\n",
    "\n",
    "    ### Q1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q1'] + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "    # break\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q1':y_pred, 'AF_q1':df_test['EPS_ana_q1'], 'AE_q1':df_test['EPS_true_q1']})\n",
    "\n",
    "    ### Q2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q2'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q2']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q2':y_pred, 'AF_q2':df_test['EPS_ana_q2'], 'AE_q2':df_test['EPS_true_q2']})\n",
    "\n",
    "    ### Q3 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q3'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q3'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q3']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q3']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q3 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q3':y_pred, 'AF_q3':df_test['EPS_ana_q3'], 'AE_q3':df_test['EPS_true_q3']})\n",
    "\n",
    "    ### Y1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_y1']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y1':y_pred, 'AF_y1':df_test['EPS_ana_y1'], 'AE_y1':df_test['EPS_true_y1']})\n",
    "\n",
    "    ### Y2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y2'\n",
    "    if window <= 24:\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(24)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "    else:\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y2':y_pred, 'AF_y2':df_test['EPS_ana_y2'], 'AE_y2':df_test['EPS_true_y2']})\n",
    "\n",
    "    forecast.append(reduce(lambda x,y: pd.merge(x,y,\n",
    "                                       on=['permno','YearMonth'],\n",
    "                                       how='outer'),\n",
    "                 [forecast_q1,forecast_q2,forecast_q3,\n",
    "                  forecast_y1,forecast_y2]))\n",
    "    # break\n",
    "forecast_all = pd.concat(forecast,axis=0).reset_index(drop=True)\n",
    "forecast_all.to_parquet(f'../data/Results/RF_variants/RF_{max_samples}_{window}.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [33:28<00:00,  4.92s/it]\n"
     ]
    }
   ],
   "source": [
    "#############################\n",
    "### Sample Fraction: 0.10 ###\n",
    "#############################\n",
    "time_idx = sorted(df_tmp['YearMonth'].unique())\n",
    "time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]\n",
    "max_samples = 0.1\n",
    "window = 12\n",
    "n_jobs = 12\n",
    "num_trees = 2000\n",
    "forecast = []\n",
    "for t in tqdm(time_idx):\n",
    "\n",
    "    ### Q1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q1'] + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "    # break\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q1':y_pred, 'AF_q1':df_test['EPS_ana_q1'], 'AE_q1':df_test['EPS_true_q1']})\n",
    "\n",
    "    ### Q2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q2'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q2']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q2':y_pred, 'AF_q2':df_test['EPS_ana_q2'], 'AE_q2':df_test['EPS_true_q2']})\n",
    "\n",
    "    ### Q3 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q3'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q3'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q3']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q3']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q3 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q3':y_pred, 'AF_q3':df_test['EPS_ana_q3'], 'AE_q3':df_test['EPS_true_q3']})\n",
    "\n",
    "    ### Y1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_y1']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y1':y_pred, 'AF_y1':df_test['EPS_ana_y1'], 'AE_y1':df_test['EPS_true_y1']})\n",
    "\n",
    "    ### Y2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y2'\n",
    "    if window <= 24:\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(24)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "    else:\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y2':y_pred, 'AF_y2':df_test['EPS_ana_y2'], 'AE_y2':df_test['EPS_true_y2']})\n",
    "\n",
    "    forecast.append(reduce(lambda x,y: pd.merge(x,y,\n",
    "                                       on=['permno','YearMonth'],\n",
    "                                       how='outer'),\n",
    "                 [forecast_q1,forecast_q2,forecast_q3,\n",
    "                  forecast_y1,forecast_y2]))\n",
    "    # break\n",
    "forecast_all = pd.concat(forecast,axis=0).reset_index(drop=True)\n",
    "forecast_all.to_parquet(f'../data/Results/RF_variants/RF_{max_samples}_{window}.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [22:53<00:00,  3.37s/it]\n"
     ]
    }
   ],
   "source": [
    "##############################\n",
    "### Feature Fraction: sqrt ###\n",
    "##############################\n",
    "time_idx = sorted(df_tmp['YearMonth'].unique())\n",
    "time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]\n",
    "max_samples = 0.05\n",
    "window = 12\n",
    "n_jobs = 12\n",
    "num_trees = 2000\n",
    "forecast = []\n",
    "feature_importance = []\n",
    "for t in tqdm(time_idx):\n",
    "\n",
    "    ### Q1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q1'\n",
    "\n",
    "    feature_importance_t = pd.DataFrame(index=x_cols,\n",
    "                                    columns=['q1','q2','q3','y1','y2'])\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q1'] + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "    # break\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             max_features='sqrt',\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    feature_importance_t['q1'] = mdl.feature_importances_\n",
    "\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q1':y_pred, 'AF_q1':df_test['EPS_ana_q1'], 'AE_q1':df_test['EPS_true_q1']})\n",
    "\n",
    "    ### Q2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q2'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q2']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             max_features='sqrt',\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "\n",
    "    feature_importance_t['q2'] = mdl.feature_importances_\n",
    "\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q2':y_pred, 'AF_q2':df_test['EPS_ana_q2'], 'AE_q2':df_test['EPS_true_q2']})\n",
    "\n",
    "    ### Q3 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q3'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q3'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q3']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q3']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             max_features='sqrt',\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    feature_importance_t['q3'] = mdl.feature_importances_\n",
    "\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q3 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q3':y_pred, 'AF_q3':df_test['EPS_ana_q3'], 'AE_q3':df_test['EPS_true_q3']})\n",
    "\n",
    "    ### Y1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_y1']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             max_features='sqrt',\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "\n",
    "    feature_importance_t['y1'] = mdl.feature_importances_\n",
    "\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y1':y_pred, 'AF_y1':df_test['EPS_ana_y1'], 'AE_y1':df_test['EPS_true_y1']})\n",
    "\n",
    "    ### Y2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y2'\n",
    "    if window <= 24:\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(24)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "    else:\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             max_features='sqrt',\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "\n",
    "    feature_importance_t['y2'] = mdl.feature_importances_\n",
    "\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y2':y_pred, 'AF_y2':df_test['EPS_ana_y2'], 'AE_y2':df_test['EPS_true_y2']})\n",
    "\n",
    "    forecast.append(reduce(lambda x,y: pd.merge(x,y,\n",
    "                                       on=['permno','YearMonth'],\n",
    "                                       how='outer'),\n",
    "                 [forecast_q1,forecast_q2,forecast_q3,\n",
    "                  forecast_y1,forecast_y2]))\n",
    "\n",
    "    feature_importance_t['YearMonth'] = t\n",
    "    feature_importance.append(feature_importance_t.reset_index())\n",
    "\n",
    "forecast_all = pd.concat(forecast,axis=0).reset_index(drop=True)\n",
    "forecast_all.to_parquet(f'../data/Results/RF_variants/RF_{max_samples}_{window}_feature_sqrt.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [23:54<00:00,  3.52s/it]\n"
     ]
    }
   ],
   "source": [
    "##############################\n",
    "### Feature Fraction: 0.5  ###\n",
    "##############################\n",
    "time_idx = sorted(df_tmp['YearMonth'].unique())\n",
    "time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]\n",
    "max_samples = 0.05\n",
    "window = 12\n",
    "n_jobs = 12\n",
    "num_trees = 2000\n",
    "forecast = []\n",
    "feature_importance = []\n",
    "for t in tqdm(time_idx):\n",
    "\n",
    "    ### Q1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q1'\n",
    "\n",
    "    feature_importance_t = pd.DataFrame(index=x_cols,\n",
    "                                    columns=['q1','q2','q3','y1','y2'])\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q1'] + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "    # break\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             max_features=0.5,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    feature_importance_t['q1'] = mdl.feature_importances_\n",
    "\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q1':y_pred, 'AF_q1':df_test['EPS_ana_q1'], 'AE_q1':df_test['EPS_true_q1']})\n",
    "\n",
    "    ### Q2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q2'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q2']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             max_features=0.5,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "\n",
    "    feature_importance_t['q2'] = mdl.feature_importances_\n",
    "\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q2':y_pred, 'AF_q2':df_test['EPS_ana_q2'], 'AE_q2':df_test['EPS_true_q2']})\n",
    "\n",
    "    ### Q3 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q3'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q3'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q3']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q3']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             max_features=0.5,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    feature_importance_t['q3'] = mdl.feature_importances_\n",
    "\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q3 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q3':y_pred, 'AF_q3':df_test['EPS_ana_q3'], 'AE_q3':df_test['EPS_true_q3']})\n",
    "\n",
    "    ### Y1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_y1']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             max_features=0.5,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "\n",
    "    feature_importance_t['y1'] = mdl.feature_importances_\n",
    "\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y1':y_pred, 'AF_y1':df_test['EPS_ana_y1'], 'AE_y1':df_test['EPS_true_y1']})\n",
    "\n",
    "    ### Y2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y2'\n",
    "    if window <= 24:\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(24)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "    else:\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             max_features=0.5,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "\n",
    "    feature_importance_t['y2'] = mdl.feature_importances_\n",
    "\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y2':y_pred, 'AF_y2':df_test['EPS_ana_y2'], 'AE_y2':df_test['EPS_true_y2']})\n",
    "\n",
    "    forecast.append(reduce(lambda x,y: pd.merge(x,y,\n",
    "                                       on=['permno','YearMonth'],\n",
    "                                       how='outer'),\n",
    "                 [forecast_q1,forecast_q2,forecast_q3,\n",
    "                  forecast_y1,forecast_y2]))\n",
    "\n",
    "    feature_importance_t['YearMonth'] = t\n",
    "    feature_importance.append(feature_importance_t.reset_index())\n",
    "\n",
    "    # break\n",
    "\n",
    "forecast_all = pd.concat(forecast,axis=0).reset_index(drop=True)\n",
    "forecast_all.to_parquet(f'../data/Results/RF_variants/RF_{max_samples}_{window}_feature_0.5.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [47:35<00:00,  7.00s/it]\n"
     ]
    }
   ],
   "source": [
    "##############################\n",
    "### Training Window: 3 years #\n",
    "##############################\n",
    "time_idx = sorted(df_tmp['YearMonth'].unique())\n",
    "time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]\n",
    "max_samples = 0.05\n",
    "window = 36\n",
    "n_jobs = 16\n",
    "num_trees = 2000\n",
    "forecast = []\n",
    "for t in tqdm(time_idx):\n",
    "\n",
    "    ### Q1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q1'] + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "    # break\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q1':y_pred, 'AF_q1':df_test['EPS_ana_q1'], 'AE_q1':df_test['EPS_true_q1']})\n",
    "\n",
    "    ### Q2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q2'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q2']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q2':y_pred, 'AF_q2':df_test['EPS_ana_q2'], 'AE_q2':df_test['EPS_true_q2']})\n",
    "\n",
    "    ### Q3 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q3'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q3'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q3']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q3']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q3 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q3':y_pred, 'AF_q3':df_test['EPS_ana_q3'], 'AE_q3':df_test['EPS_true_q3']})\n",
    "\n",
    "    ### Y1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_y1']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y1':y_pred, 'AF_y1':df_test['EPS_ana_y1'], 'AE_y1':df_test['EPS_true_y1']})\n",
    "\n",
    "    ### Y2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y2'\n",
    "    if window <= 24:\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(24)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "    else:\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y2':y_pred, 'AF_y2':df_test['EPS_ana_y2'], 'AE_y2':df_test['EPS_true_y2']})\n",
    "\n",
    "    forecast.append(reduce(lambda x,y: pd.merge(x,y,\n",
    "                                       on=['permno','YearMonth'],\n",
    "                                       how='outer'),\n",
    "                 [forecast_q1,forecast_q2,forecast_q3,\n",
    "                  forecast_y1,forecast_y2]))\n",
    "    # break\n",
    "forecast_all = pd.concat(forecast,axis=0).reset_index(drop=True)\n",
    "forecast_all.to_parquet(f'../data/Results/RF_variants/RF_{max_samples}_{window}.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/408 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [1:27:32<00:00, 12.87s/it]\n"
     ]
    }
   ],
   "source": [
    "##############################\n",
    "### Training Window: 5 years #\n",
    "##############################\n",
    "time_idx = sorted(df_tmp['YearMonth'].unique())\n",
    "time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]\n",
    "max_samples = 0.05\n",
    "window = 60\n",
    "n_jobs = 16\n",
    "num_trees = 2000\n",
    "forecast = []\n",
    "for t in tqdm(time_idx):\n",
    "\n",
    "    ### Q1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q1'] + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "    # break\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q1':y_pred, 'AF_q1':df_test['EPS_ana_q1'], 'AE_q1':df_test['EPS_true_q1']})\n",
    "\n",
    "    ### Q2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q2'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q2']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q2':y_pred, 'AF_q2':df_test['EPS_ana_q2'], 'AE_q2':df_test['EPS_true_q2']})\n",
    "\n",
    "    ### Q3 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q3'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q3'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q3']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q3']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q3 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q3':y_pred, 'AF_q3':df_test['EPS_ana_q3'], 'AE_q3':df_test['EPS_true_q3']})\n",
    "\n",
    "    ### Y1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_y1']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y1':y_pred, 'AF_y1':df_test['EPS_ana_y1'], 'AE_y1':df_test['EPS_true_y1']})\n",
    "\n",
    "    ### Y2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y2'\n",
    "    if window <= 24:\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(24)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "    else:\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y2':y_pred, 'AF_y2':df_test['EPS_ana_y2'], 'AE_y2':df_test['EPS_true_y2']})\n",
    "\n",
    "    forecast.append(reduce(lambda x,y: pd.merge(x,y,\n",
    "                                       on=['permno','YearMonth'],\n",
    "                                       how='outer'),\n",
    "                 [forecast_q1,forecast_q2,forecast_q3,\n",
    "                  forecast_y1,forecast_y2]))\n",
    "    # break\n",
    "forecast_all = pd.concat(forecast,axis=0).reset_index(drop=True)\n",
    "forecast_all.to_parquet(f'../data/Results/RF_variants/RF_{max_samples}_{window}.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [28:36<00:00,  4.21s/it]\n"
     ]
    }
   ],
   "source": [
    "###############################\n",
    "## With Per-share variables ###\n",
    "###############################\n",
    "time_idx = sorted(df_tmp['YearMonth'].unique())\n",
    "time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]\n",
    "max_samples = 0.05\n",
    "window = 12\n",
    "n_jobs = 12\n",
    "num_trees = 2000\n",
    "forecast = []\n",
    "for t in tqdm(time_idx):\n",
    "\n",
    "    ### Q1 ###\n",
    "    x_cols = ratio_chars + per_share_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q1'] + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "    # break\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q1':y_pred, 'AF_q1':df_test['EPS_ana_q1'], 'AE_q1':df_test['EPS_true_q1']})\n",
    "\n",
    "    ### Q2 ###\n",
    "    x_cols = ratio_chars + per_share_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q2'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q2']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q2':y_pred, 'AF_q2':df_test['EPS_ana_q2'], 'AE_q2':df_test['EPS_true_q2']})\n",
    "\n",
    "    ### Q3 ###\n",
    "    x_cols = ratio_chars + per_share_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q3'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q3'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q3']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q3']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q3 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q3':y_pred, 'AF_q3':df_test['EPS_ana_q3'], 'AE_q3':df_test['EPS_true_q3']})\n",
    "\n",
    "    ### Y1 ###\n",
    "    x_cols = ratio_chars + per_share_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_y1']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y1':y_pred, 'AF_y1':df_test['EPS_ana_y1'], 'AE_y1':df_test['EPS_true_y1']})\n",
    "\n",
    "    ### Y2 ###\n",
    "    x_cols = ratio_chars + per_share_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y2'\n",
    "    if window <= 24:\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(24)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "    else:\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y2':y_pred, 'AF_y2':df_test['EPS_ana_y2'], 'AE_y2':df_test['EPS_true_y2']})\n",
    "\n",
    "    forecast.append(reduce(lambda x,y: pd.merge(x,y,\n",
    "                                       on=['permno','YearMonth'],\n",
    "                                       how='outer'),\n",
    "                 [forecast_q1,forecast_q2,forecast_q3,\n",
    "                  forecast_y1,forecast_y2]))\n",
    "    # break\n",
    "forecast_all = pd.concat(forecast,axis=0).reset_index(drop=True)\n",
    "forecast_all.to_parquet(f'../data/Results/RF_variants/RF_{max_samples}_{window}_chars.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [23:29<00:00,  3.46s/it]\n"
     ]
    }
   ],
   "source": [
    "##############################\n",
    "### Standardization: Rank ####\n",
    "##############################\n",
    "df_tmp = pd.read_parquet('../data/Results/df_train_new.parquet')\n",
    "## CS-Rank\n",
    "cols = ratio_chars + per_share_chars + fundamental_chars + analyst_chars\n",
    "df_tmp[cols] = df_tmp.groupby('YearMonth',group_keys=False)[cols]\\\n",
    "                             .transform(lambda x: x.rank(pct=True)*2 - 1)\n",
    "time_idx = sorted(df_tmp['YearMonth'].unique())\n",
    "time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]\n",
    "max_samples = 0.05\n",
    "window = 12\n",
    "n_jobs = 12\n",
    "num_trees = 2000\n",
    "forecast = []\n",
    "for t in tqdm(time_idx):\n",
    "\n",
    "    ### Q1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q1'] + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "    # break\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q1':y_pred, 'AF_q1':df_test['EPS_ana_q1'], 'AE_q1':df_test['EPS_true_q1']})\n",
    "\n",
    "    ### Q2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q2'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q2']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q2':y_pred, 'AF_q2':df_test['EPS_ana_q2'], 'AE_q2':df_test['EPS_true_q2']})\n",
    "\n",
    "    ### Q3 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q3'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q3'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q3']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q3']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q3 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q3':y_pred, 'AF_q3':df_test['EPS_ana_q3'], 'AE_q3':df_test['EPS_true_q3']})\n",
    "\n",
    "    ### Y1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_y1']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y1':y_pred, 'AF_y1':df_test['EPS_ana_y1'], 'AE_y1':df_test['EPS_true_y1']})\n",
    "\n",
    "    ### Y2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y2'\n",
    "    if window <= 24:\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(24)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "    else:\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y2':y_pred, 'AF_y2':df_test['EPS_ana_y2'], 'AE_y2':df_test['EPS_true_y2']})\n",
    "\n",
    "    forecast.append(reduce(lambda x,y: pd.merge(x,y,\n",
    "                                       on=['permno','YearMonth'],\n",
    "                                       how='outer'),\n",
    "                 [forecast_q1,forecast_q2,forecast_q3,\n",
    "                  forecast_y1,forecast_y2]))\n",
    "    # break\n",
    "forecast_all = pd.concat(forecast,axis=0).reset_index(drop=True)\n",
    "forecast_all.to_parquet(f'../data/Results/RF_variants/RF_{max_samples}_{window}_rank.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [23:35<00:00,  3.47s/it]\n"
     ]
    }
   ],
   "source": [
    "###############################\n",
    "### Standardization: Z-score ##\n",
    "###############################\n",
    "df_tmp = pd.read_parquet('../data/Results/df_train_new.parquet')\n",
    "## Z-score\n",
    "cols = ratio_chars + per_share_chars + fundamental_chars + analyst_chars\n",
    "df_tmp[cols] = df_tmp.groupby('YearMonth',group_keys=False)[cols]\\\n",
    "                             .transform(lambda x: (x - x.mean()) / x.std())\n",
    "time_idx = sorted(df_tmp['YearMonth'].unique())\n",
    "time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]\n",
    "max_samples = 0.05\n",
    "window = 12\n",
    "n_jobs = 12\n",
    "num_trees = 2000\n",
    "forecast = []\n",
    "for t in tqdm(time_idx):\n",
    "\n",
    "    ### Q1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q1'] + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "    # break\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q1':y_pred, 'AF_q1':df_test['EPS_ana_q1'], 'AE_q1':df_test['EPS_true_q1']})\n",
    "\n",
    "    ### Q2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q2'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q2']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q2':y_pred, 'AF_q2':df_test['EPS_ana_q2'], 'AE_q2':df_test['EPS_true_q2']})\n",
    "\n",
    "    ### Q3 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q3'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_q3'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_q3']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_q3']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_q3 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_q3':y_pred, 'AF_q3':df_test['EPS_ana_q3'], 'AE_q3':df_test['EPS_true_q3']})\n",
    "\n",
    "    ### Y1 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y1'\n",
    "\n",
    "    df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_y1']  + MonthEnd(0) < t)]\\\n",
    "               .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y1']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y1':y_pred, 'AF_y1':df_test['EPS_ana_y1'], 'AE_y1':df_test['EPS_true_y1']})\n",
    "\n",
    "    ### Y2 ###\n",
    "    x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "    y_col = 'EPS_true_y2'\n",
    "    if window <= 24:\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(24)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "    else:\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(window)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "    df_test = df_tmp[(df_tmp['ANNDATS_y2']>df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "    mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                             random_state=0,\n",
    "                             max_depth=7,\n",
    "                             min_samples_leaf=5,\n",
    "                             max_samples=max_samples,\n",
    "                             n_jobs=n_jobs)\n",
    "\n",
    "    mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "    y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "    forecast_y2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                'RF_y2':y_pred, 'AF_y2':df_test['EPS_ana_y2'], 'AE_y2':df_test['EPS_true_y2']})\n",
    "\n",
    "    forecast.append(reduce(lambda x,y: pd.merge(x,y,\n",
    "                                       on=['permno','YearMonth'],\n",
    "                                       how='outer'),\n",
    "                 [forecast_q1,forecast_q2,forecast_q3,\n",
    "                  forecast_y1,forecast_y2]))\n",
    "    # break\n",
    "forecast_all = pd.concat(forecast,axis=0).reset_index(drop=True)\n",
    "forecast_all.to_parquet(f'../data/Results/RF_variants/RF_{max_samples}_{window}_zscore.parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# All Permutation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tmp_raw = pd.read_parquet('../data/Results/df_train_new.parquet')\n",
    "cols = ratio_chars + per_share_chars + fundamental_chars + analyst_chars\n",
    "\n",
    "df_tmp_rank = df_tmp_raw.copy()\n",
    "df_tmp_rank[cols] = df_tmp_rank.groupby('YearMonth',group_keys=False)[cols]\\\n",
    "                               .transform(lambda x: x.rank(pct=True)*2 - 1)\n",
    "\n",
    "df_tmp_zscore = df_tmp_raw.copy()\n",
    "df_tmp_zscore[cols] = df_tmp_zscore.groupby('YearMonth',group_keys=False)[cols]\\\n",
    "                                   .transform(lambda x: (x - x.mean()) / x.std())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "param_grid = ParameterGrid({'sample': [0.05, 0.10],\n",
    "                            'feature': ['sqrt', 1.0],\n",
    "                            'window': [12, 24],\n",
    "                            'std': ['zscore', 'raw', 'rank',], #\n",
    "                            'eps': [True, False],\n",
    "                            })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=sqrt; window=12; std=zscore; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [09:27<00:00,  1.39s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=sqrt; window=24; std=zscore; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [14:21<00:00,  2.11s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=sqrt; window=12; std=raw; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [09:12<00:00,  1.35s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=sqrt; window=24; std=raw; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [14:30<00:00,  2.13s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=sqrt; window=12; std=rank; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [09:13<00:00,  1.36s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=sqrt; window=24; std=rank; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [14:35<00:00,  2.15s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=sqrt; window=12; std=zscore; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [11:06<00:00,  1.63s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=sqrt; window=24; std=zscore; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [18:26<00:00,  2.71s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=sqrt; window=12; std=raw; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [11:15<00:00,  1.66s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=sqrt; window=24; std=raw; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [17:21<00:00,  2.55s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=sqrt; window=12; std=rank; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [10:16<00:00,  1.51s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=sqrt; window=24; std=rank; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [16:51<00:00,  2.48s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=1.0; window=12; std=zscore; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [28:25<00:00,  4.18s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=1.0; window=24; std=zscore; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [51:46<00:00,  7.61s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=1.0; window=12; std=raw; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [28:36<00:00,  4.21s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=1.0; window=24; std=raw; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [52:25<00:00,  7.71s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=1.0; window=12; std=rank; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [29:03<00:00,  4.27s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=1.0; window=24; std=rank; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [51:34<00:00,  7.58s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=1.0; window=12; std=zscore; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [36:37<00:00,  5.39s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=1.0; window=24; std=zscore; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [1:04:40<00:00,  9.51s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=1.0; window=12; std=raw; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [35:57<00:00,  5.29s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=1.0; window=24; std=raw; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [1:04:34<00:00,  9.50s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=1.0; window=12; std=rank; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [38:04<00:00,  5.60s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=1.0; window=24; std=rank; eps=True\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [1:08:39<00:00, 10.10s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=sqrt; window=12; std=zscore; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [08:00<00:00,  1.18s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=sqrt; window=24; std=zscore; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [12:12<00:00,  1.79s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=sqrt; window=12; std=raw; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [08:03<00:00,  1.18s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=sqrt; window=24; std=raw; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [13:45<00:00,  2.02s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=sqrt; window=12; std=rank; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [08:06<00:00,  1.19s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=sqrt; window=24; std=rank; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [11:33<00:00,  1.70s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=sqrt; window=12; std=zscore; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [08:39<00:00,  1.27s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=sqrt; window=24; std=zscore; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [13:20<00:00,  1.96s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=sqrt; window=12; std=raw; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [08:38<00:00,  1.27s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=sqrt; window=24; std=raw; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [13:19<00:00,  1.96s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=sqrt; window=12; std=rank; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [08:41<00:00,  1.28s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=sqrt; window=24; std=rank; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [13:25<00:00,  1.97s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=1.0; window=12; std=zscore; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [23:16<00:00,  3.42s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=1.0; window=24; std=zscore; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [43:21<00:00,  6.38s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=1.0; window=12; std=raw; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [25:00<00:00,  3.68s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=1.0; window=24; std=raw; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [40:22<00:00,  5.94s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=1.0; window=12; std=rank; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [21:56<00:00,  3.23s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.05; feature=1.0; window=24; std=rank; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [36:00<00:00,  5.30s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=1.0; window=12; std=zscore; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [30:47<00:00,  4.53s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=1.0; window=24; std=zscore; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [55:48<00:00,  8.21s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=1.0; window=12; std=raw; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [30:48<00:00,  4.53s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=1.0; window=24; std=raw; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [1:01:13<00:00,  9.00s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=1.0; window=12; std=rank; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [33:22<00:00,  4.91s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current Parameter: sample=0.1; feature=1.0; window=24; std=rank; eps=False\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 408/408 [59:25<00:00,  8.74s/it]\n"
     ]
    }
   ],
   "source": [
    "time_idx = sorted(df_tmp_raw['YearMonth'].unique())\n",
    "time_idx = [i for i in time_idx if i > pd.to_datetime('1986-01-01')]\n",
    "\n",
    "n_jobs = 12\n",
    "num_trees = 2000\n",
    "\n",
    "for params in param_grid:\n",
    "\n",
    "    print(f\"Current Parameter: sample={params['sample']}; feature={params['feature']}; window={params['window']}; std={params['std']}; eps={params['eps']}\")\n",
    "\n",
    "    forecast = []\n",
    "    if params['std'] == 'raw':\n",
    "        df_tmp = df_tmp_raw.copy()\n",
    "    if params['std'] == 'rank':\n",
    "        df_tmp = df_tmp_rank.copy()\n",
    "    if params['std'] == 'zscore':\n",
    "        df_tmp = df_tmp_zscore.copy()\n",
    "\n",
    "    for t in tqdm(time_idx):\n",
    "\n",
    "        ### Q1 ###\n",
    "        x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "        if params['eps']:\n",
    "            x_cols = x_cols + per_share_chars\n",
    "        y_col = 'EPS_true_q1'\n",
    "\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(params['window'])) & (df_tmp['ANNDATS_q1'] + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "        df_test = df_tmp[(df_tmp['ANNDATS_q1'] > df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "        # break\n",
    "        mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                                     random_state=0,\n",
    "                                     max_depth=7,\n",
    "                                     min_samples_leaf=5,\n",
    "                                     max_samples=params['sample'],\n",
    "                                     max_features=params['feature'],\n",
    "                                     n_jobs=n_jobs)\n",
    "\n",
    "        mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "        y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "        forecast_q1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                    'RF_q1':y_pred, 'AF_q1':df_test['EPS_ana_q1'], 'AE_q1':df_test['EPS_true_q1']})\n",
    "\n",
    "        ### Q2 ###\n",
    "        x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "        if params['eps']:\n",
    "            x_cols = x_cols + per_share_chars\n",
    "        y_col = 'EPS_true_q2'\n",
    "\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(params['window'])) & (df_tmp['ANNDATS_q2']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "        df_test = df_tmp[(df_tmp['ANNDATS_q2'] > df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "        mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                                     random_state=0,\n",
    "                                     max_depth=7,\n",
    "                                     min_samples_leaf=5,\n",
    "                                     max_samples=params['sample'],\n",
    "                                     max_features=params['feature'],\n",
    "                                     n_jobs=n_jobs)\n",
    "\n",
    "        mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "        y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "        forecast_q2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                    'RF_q2':y_pred, 'AF_q2':df_test['EPS_ana_q2'], 'AE_q2':df_test['EPS_true_q2']})\n",
    "\n",
    "        ### Q3 ###\n",
    "        x_cols = ratio_chars + ['ret','prc','EPS_true_l1_q1','EPS_ana_q3'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "        if params['eps']:\n",
    "            x_cols = x_cols + per_share_chars\n",
    "        y_col = 'EPS_true_q3'\n",
    "\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(params['window'])) & (df_tmp['ANNDATS_q3']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "        df_test = df_tmp[(df_tmp['ANNDATS_q3'] > df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "        mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                                     random_state=0,\n",
    "                                     max_depth=7,\n",
    "                                     min_samples_leaf=5,\n",
    "                                     max_samples=params['sample'],\n",
    "                                     max_features=params['feature'],\n",
    "                                     n_jobs=n_jobs)\n",
    "\n",
    "        mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "        y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "        forecast_q3 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                    'RF_q3':y_pred, 'AF_q3':df_test['EPS_ana_q3'], 'AE_q3':df_test['EPS_true_q3']})\n",
    "\n",
    "        ### Y1 ###\n",
    "        x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y1'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "        if params['eps']:\n",
    "            x_cols = x_cols + per_share_chars\n",
    "        y_col = 'EPS_true_y1'\n",
    "\n",
    "        df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(params['window'])) & (df_tmp['ANNDATS_y1']  + MonthEnd(0) < t)]\\\n",
    "                   .dropna(subset=x_cols+[y_col])\n",
    "        df_test = df_tmp[(df_tmp['ANNDATS_y1'] > df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "        mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                                     random_state=0,\n",
    "                                     max_depth=7,\n",
    "                                     min_samples_leaf=5,\n",
    "                                     max_samples=params['sample'],\n",
    "                                     max_features=params['feature'],\n",
    "                                     n_jobs=n_jobs)\n",
    "\n",
    "        mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "        y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "        forecast_y1 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                    'RF_y1':y_pred, 'AF_y1':df_test['EPS_ana_y1'], 'AE_y1':df_test['EPS_true_y1']})\n",
    "\n",
    "        ### Y2 ###\n",
    "        x_cols = ratio_chars + ['ret','prc','EPS_true_l1_y1','EPS_ana_y2'] + ['RGDP', 'RCON', 'INDPROD', 'UNEMP']\n",
    "        if params['eps']:\n",
    "            x_cols = x_cols + per_share_chars\n",
    "        y_col = 'EPS_true_y2'\n",
    "\n",
    "        if params['window'] <= 24:\n",
    "            df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(24)) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                       .dropna(subset=x_cols+[y_col])\n",
    "        else:\n",
    "            df_train = df_tmp[(df_tmp['YearMonth'] < t) & (df_tmp['YearMonth'] >= t - MonthEnd(params['window'])) & (df_tmp['ANNDATS_y2']  + MonthEnd(0) < t)]\\\n",
    "                       .dropna(subset=x_cols+[y_col])\n",
    "        df_test = df_tmp[(df_tmp['ANNDATS_y2'] > df_tmp['YearMonth']) & (df_tmp['YearMonth'] == t)].dropna(subset=x_cols+[y_col])\n",
    "\n",
    "        mdl = RandomForestRegressor(n_estimators=num_trees,\n",
    "                                     random_state=0,\n",
    "                                     max_depth=7,\n",
    "                                     min_samples_leaf=5,\n",
    "                                     max_samples=params['sample'],\n",
    "                                     max_features=params['feature'],\n",
    "                                     n_jobs=n_jobs)\n",
    "\n",
    "        mdl.fit(df_train[x_cols], df_train[y_col])\n",
    "        y_pred = mdl.predict(df_test[x_cols])\n",
    "\n",
    "        forecast_y2 = pd.DataFrame({'permno':df_test['permno'],'YearMonth':df_test['YearMonth'],\n",
    "                                    'RF_y2':y_pred, 'AF_y2':df_test['EPS_ana_y2'], 'AE_y2':df_test['EPS_true_y2']})\n",
    "\n",
    "        forecast.append(reduce(lambda x,y: pd.merge(x,y,\n",
    "                                           on=['permno','YearMonth'],\n",
    "                                           how='outer'),\n",
    "                     [forecast_q1,forecast_q2,forecast_q3,\n",
    "                      forecast_y1,forecast_y2]))\n",
    "        # break\n",
    "    forecast_all = pd.concat(forecast,axis=0).reset_index(drop=True)\n",
    "    forecast_all.to_parquet(f\"../data/Results/RF_variants_ALL/RF_sample_{params['sample']}_feature_{params['feature']}_window_{params['window']}_std_{params['std']}_eps_{params['eps']}.parquet\")\n",
    "    # break"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Table F.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tmp = pd.read_parquet('../data/Results/df_train_new.parquet')\n",
    "f_abbr_list = [('RF_wo_lookahead_raw_005','baseline'),\n",
    "               ('RF_0.01_12','001'),\n",
    "               ('RF_0.1_12','010'),\n",
    "               ('RF_0.05_12_feature_sqrt','sqrt'),\n",
    "               ('RF_0.05_12_feature_0.5','0.5'),\n",
    "               ('RF_0.05_36','36m'),\n",
    "               ('RF_0.05_60','60m'),\n",
    "               ('RF_0.05_12_rank','rank'),\n",
    "               ('RF_0.05_12_zscore','zscore'),\n",
    "               ('RF_0.05_12_chars','chars'),\n",
    "              ]\n",
    "abbr_list = list(map(lambda x: x[1], f_abbr_list))\n",
    "\n",
    "forecast_all = []\n",
    "for f,abbr in f_abbr_list:\n",
    "    RF = pd.read_parquet(f'../data/Results/RF_variants/{f}.parquet')\n",
    "    RF = RF[['permno','YearMonth','RF_q1','RF_q2','RF_q3','RF_y1','RF_y2',]].set_index(['permno','YearMonth'])\n",
    "    RF.columns = [f'{i}_{abbr}' for i in RF.columns]\n",
    "    forecast_all.append(RF)\n",
    "\n",
    "forecast_all = reduce(lambda x,y: pd.merge(x,y,on=['permno','YearMonth'],how='outer'),\n",
    "                      forecast_all)\n",
    "forecast_all.reset_index(inplace=True)\n",
    "\n",
    "df = df_tmp.merge(forecast_all, on=['permno','YearMonth'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>baseline</th>\n",
       "      <th>001</th>\n",
       "      <th>010</th>\n",
       "      <th>sqrt</th>\n",
       "      <th>0.5</th>\n",
       "      <th>36m</th>\n",
       "      <th>60m</th>\n",
       "      <th>rank</th>\n",
       "      <th>zscore</th>\n",
       "      <th>chars</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>q1</th>\n",
       "      <td>0.071387</td>\n",
       "      <td>0.074379</td>\n",
       "      <td>0.071334</td>\n",
       "      <td>0.107544</td>\n",
       "      <td>0.073505</td>\n",
       "      <td>0.071662</td>\n",
       "      <td>0.071911</td>\n",
       "      <td>0.073049</td>\n",
       "      <td>0.072873</td>\n",
       "      <td>0.071612</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>q2</th>\n",
       "      <td>0.090191</td>\n",
       "      <td>0.094819</td>\n",
       "      <td>0.090169</td>\n",
       "      <td>0.122879</td>\n",
       "      <td>0.092405</td>\n",
       "      <td>0.090974</td>\n",
       "      <td>0.090943</td>\n",
       "      <td>0.090589</td>\n",
       "      <td>0.090434</td>\n",
       "      <td>0.09057</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>q3</th>\n",
       "      <td>0.11964</td>\n",
       "      <td>0.128894</td>\n",
       "      <td>0.119717</td>\n",
       "      <td>0.149938</td>\n",
       "      <td>0.122092</td>\n",
       "      <td>0.120189</td>\n",
       "      <td>0.119895</td>\n",
       "      <td>0.118637</td>\n",
       "      <td>0.118271</td>\n",
       "      <td>0.119795</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>y1</th>\n",
       "      <td>0.601927</td>\n",
       "      <td>0.705898</td>\n",
       "      <td>0.600189</td>\n",
       "      <td>1.019591</td>\n",
       "      <td>0.640636</td>\n",
       "      <td>0.60094</td>\n",
       "      <td>0.601187</td>\n",
       "      <td>0.628581</td>\n",
       "      <td>0.625161</td>\n",
       "      <td>0.603321</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>y2</th>\n",
       "      <td>1.80457</td>\n",
       "      <td>1.926102</td>\n",
       "      <td>1.805712</td>\n",
       "      <td>2.123661</td>\n",
       "      <td>1.82427</td>\n",
       "      <td>1.797241</td>\n",
       "      <td>1.757604</td>\n",
       "      <td>1.773479</td>\n",
       "      <td>1.775499</td>\n",
       "      <td>1.810418</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    baseline       001       010      sqrt       0.5       36m       60m  \\\n",
       "q1  0.071387  0.074379  0.071334  0.107544  0.073505  0.071662  0.071911   \n",
       "q2  0.090191  0.094819  0.090169  0.122879  0.092405  0.090974  0.090943   \n",
       "q3   0.11964  0.128894  0.119717  0.149938  0.122092  0.120189  0.119895   \n",
       "y1  0.601927  0.705898  0.600189  1.019591  0.640636   0.60094  0.601187   \n",
       "y2   1.80457  1.926102  1.805712  2.123661   1.82427  1.797241  1.757604   \n",
       "\n",
       "        rank    zscore     chars  \n",
       "q1  0.073049  0.072873  0.071612  \n",
       "q2  0.090589  0.090434   0.09057  \n",
       "q3  0.118637  0.118271  0.119795  \n",
       "y1  0.628581  0.625161  0.603321  \n",
       "y2  1.773479  1.775499  1.810418  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "### Panel A of Table F.1\n",
    "### 1. Forecast Performance (RF-AE)**2\n",
    "idx = ['q1','q2','q3','y1','y2']\n",
    "col = abbr_list\n",
    "MSE = pd.DataFrame(index=idx, columns=col)\n",
    "## To make sure we have the same sample\n",
    "N_obs = pd.DataFrame(index=idx, columns=col)\n",
    "for c in col:\n",
    "    for i in idx:\n",
    "        df_ = df.dropna(subset=[f'EPS_true_{i}',f'RF_{i}_{c}'])\n",
    "        MSE.loc[i, c] = df_.groupby('YearMonth').apply(lambda x: np.mean((x[f'EPS_true_{i}']-x[f'RF_{i}_{c}'])**2)\n",
    "                                                        ).mean()\n",
    "        N_obs.loc[i,c] = df_.shape[0]\n",
    "MSE.round(3).to_clipboard()\n",
    "MSE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_factor = pd.read_csv('../data/Other/ff5_factors_m.CSV')\n",
    "all_factor['YearMonth'] = pd.to_datetime(all_factor['yyyymm'], format='%Y%m') + MonthEnd(0)\n",
    "all_factor['YearMonth'] = all_factor['YearMonth'] + MonthEnd(-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Var:baseline_Bias_Avg, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:001_Bias_Avg, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:010_Bias_Avg, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:sqrt_Bias_Avg, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:0.5_Bias_Avg, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:36m_Bias_Avg, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:60m_Bias_Avg, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:rank_Bias_Avg, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:zscore_Bias_Avg, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:chars_Bias_Avg, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>baseline</th>\n",
       "      <th>001</th>\n",
       "      <th>010</th>\n",
       "      <th>sqrt</th>\n",
       "      <th>0.5</th>\n",
       "      <th>36m</th>\n",
       "      <th>60m</th>\n",
       "      <th>rank</th>\n",
       "      <th>zscore</th>\n",
       "      <th>chars</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Ret</th>\n",
       "      <td>-0.25</td>\n",
       "      <td>-0.08</td>\n",
       "      <td>-0.20</td>\n",
       "      <td>0.16</td>\n",
       "      <td>-0.18</td>\n",
       "      <td>-0.26</td>\n",
       "      <td>-0.24</td>\n",
       "      <td>-0.19</td>\n",
       "      <td>-0.12</td>\n",
       "      <td>-0.23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>(-0.65)</td>\n",
       "      <td>(-0.23)</td>\n",
       "      <td>(-0.56)</td>\n",
       "      <td>(0.69)</td>\n",
       "      <td>(-0.52)</td>\n",
       "      <td>(-0.69)</td>\n",
       "      <td>(-0.69)</td>\n",
       "      <td>(-0.52)</td>\n",
       "      <td>(-0.35)</td>\n",
       "      <td>(-0.62)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CAPM</th>\n",
       "      <td>-0.67</td>\n",
       "      <td>-0.47</td>\n",
       "      <td>-0.61</td>\n",
       "      <td>0.04</td>\n",
       "      <td>-0.58</td>\n",
       "      <td>-0.70</td>\n",
       "      <td>-0.68</td>\n",
       "      <td>-0.52</td>\n",
       "      <td>-0.45</td>\n",
       "      <td>-0.65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>(-1.90)</td>\n",
       "      <td>(-1.53)</td>\n",
       "      <td>(-1.83)</td>\n",
       "      <td>(0.18)</td>\n",
       "      <td>(-1.74)</td>\n",
       "      <td>(-2.05)</td>\n",
       "      <td>(-2.18)</td>\n",
       "      <td>(-1.37)</td>\n",
       "      <td>(-1.26)</td>\n",
       "      <td>(-1.90)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FF3</th>\n",
       "      <td>-0.76</td>\n",
       "      <td>-0.51</td>\n",
       "      <td>-0.71</td>\n",
       "      <td>-0.10</td>\n",
       "      <td>-0.68</td>\n",
       "      <td>-0.80</td>\n",
       "      <td>-0.78</td>\n",
       "      <td>-0.69</td>\n",
       "      <td>-0.63</td>\n",
       "      <td>-0.74</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>(-3.25)</td>\n",
       "      <td>(-2.49)</td>\n",
       "      <td>(-3.29)</td>\n",
       "      <td>(-0.52)</td>\n",
       "      <td>(-2.98)</td>\n",
       "      <td>(-3.52)</td>\n",
       "      <td>(-3.86)</td>\n",
       "      <td>(-2.70)</td>\n",
       "      <td>(-2.53)</td>\n",
       "      <td>(-3.18)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FF5</th>\n",
       "      <td>-0.41</td>\n",
       "      <td>-0.10</td>\n",
       "      <td>-0.36</td>\n",
       "      <td>0.08</td>\n",
       "      <td>-0.29</td>\n",
       "      <td>-0.44</td>\n",
       "      <td>-0.35</td>\n",
       "      <td>-0.49</td>\n",
       "      <td>-0.39</td>\n",
       "      <td>-0.39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>(-1.42)</td>\n",
       "      <td>(-0.34)</td>\n",
       "      <td>(-1.30)</td>\n",
       "      <td>(0.36)</td>\n",
       "      <td>(-1.09)</td>\n",
       "      <td>(-1.62)</td>\n",
       "      <td>(-1.41)</td>\n",
       "      <td>(-1.87)</td>\n",
       "      <td>(-1.26)</td>\n",
       "      <td>(-1.36)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FFC6</th>\n",
       "      <td>-0.02</td>\n",
       "      <td>0.28</td>\n",
       "      <td>0.03</td>\n",
       "      <td>0.29</td>\n",
       "      <td>0.08</td>\n",
       "      <td>-0.05</td>\n",
       "      <td>0.02</td>\n",
       "      <td>-0.10</td>\n",
       "      <td>0.02</td>\n",
       "      <td>0.01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>(-0.09)</td>\n",
       "      <td>(1.27)</td>\n",
       "      <td>(0.16)</td>\n",
       "      <td>(1.58)</td>\n",
       "      <td>(0.36)</td>\n",
       "      <td>(-0.25)</td>\n",
       "      <td>(0.10)</td>\n",
       "      <td>(-0.39)</td>\n",
       "      <td>(0.07)</td>\n",
       "      <td>(0.03)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HXZ</th>\n",
       "      <td>-0.08</td>\n",
       "      <td>0.25</td>\n",
       "      <td>-0.03</td>\n",
       "      <td>0.16</td>\n",
       "      <td>-0.04</td>\n",
       "      <td>-0.13</td>\n",
       "      <td>-0.11</td>\n",
       "      <td>-0.30</td>\n",
       "      <td>-0.22</td>\n",
       "      <td>-0.05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>(-0.25)</td>\n",
       "      <td>(0.90)</td>\n",
       "      <td>(-0.11)</td>\n",
       "      <td>(0.61)</td>\n",
       "      <td>(-0.14)</td>\n",
       "      <td>(-0.48)</td>\n",
       "      <td>(-0.42)</td>\n",
       "      <td>(-0.90)</td>\n",
       "      <td>(-0.57)</td>\n",
       "      <td>(-0.17)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HMXZ</th>\n",
       "      <td>-0.01</td>\n",
       "      <td>0.13</td>\n",
       "      <td>0.04</td>\n",
       "      <td>-0.02</td>\n",
       "      <td>-0.04</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.06</td>\n",
       "      <td>-0.33</td>\n",
       "      <td>-0.25</td>\n",
       "      <td>0.02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>(-0.02)</td>\n",
       "      <td>(0.45)</td>\n",
       "      <td>(0.14)</td>\n",
       "      <td>(-0.08)</td>\n",
       "      <td>(-0.14)</td>\n",
       "      <td>(0.00)</td>\n",
       "      <td>(0.24)</td>\n",
       "      <td>(-1.08)</td>\n",
       "      <td>(-0.69)</td>\n",
       "      <td>(0.06)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SY</th>\n",
       "      <td>0.07</td>\n",
       "      <td>0.29</td>\n",
       "      <td>0.12</td>\n",
       "      <td>0.23</td>\n",
       "      <td>0.13</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.10</td>\n",
       "      <td>-0.14</td>\n",
       "      <td>-0.02</td>\n",
       "      <td>0.10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>(0.30)</td>\n",
       "      <td>(1.09)</td>\n",
       "      <td>(0.56)</td>\n",
       "      <td>(0.92)</td>\n",
       "      <td>(0.59)</td>\n",
       "      <td>(0.02)</td>\n",
       "      <td>(0.52)</td>\n",
       "      <td>(-0.56)</td>\n",
       "      <td>(-0.09)</td>\n",
       "      <td>(0.46)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DHS</th>\n",
       "      <td>0.31</td>\n",
       "      <td>0.47</td>\n",
       "      <td>0.32</td>\n",
       "      <td>0.17</td>\n",
       "      <td>0.29</td>\n",
       "      <td>0.29</td>\n",
       "      <td>0.34</td>\n",
       "      <td>0.21</td>\n",
       "      <td>0.23</td>\n",
       "      <td>0.33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>(0.74)</td>\n",
       "      <td>(1.17)</td>\n",
       "      <td>(0.81)</td>\n",
       "      <td>(0.71)</td>\n",
       "      <td>(0.74)</td>\n",
       "      <td>(0.73)</td>\n",
       "      <td>(0.93)</td>\n",
       "      <td>(0.52)</td>\n",
       "      <td>(0.56)</td>\n",
       "      <td>(0.80)</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     baseline      001      010     sqrt      0.5      36m      60m     rank  \\\n",
       "Ret     -0.25    -0.08    -0.20     0.16    -0.18    -0.26    -0.24    -0.19   \n",
       "      (-0.65)  (-0.23)  (-0.56)   (0.69)  (-0.52)  (-0.69)  (-0.69)  (-0.52)   \n",
       "CAPM    -0.67    -0.47    -0.61     0.04    -0.58    -0.70    -0.68    -0.52   \n",
       "      (-1.90)  (-1.53)  (-1.83)   (0.18)  (-1.74)  (-2.05)  (-2.18)  (-1.37)   \n",
       "FF3     -0.76    -0.51    -0.71    -0.10    -0.68    -0.80    -0.78    -0.69   \n",
       "      (-3.25)  (-2.49)  (-3.29)  (-0.52)  (-2.98)  (-3.52)  (-3.86)  (-2.70)   \n",
       "FF5     -0.41    -0.10    -0.36     0.08    -0.29    -0.44    -0.35    -0.49   \n",
       "      (-1.42)  (-0.34)  (-1.30)   (0.36)  (-1.09)  (-1.62)  (-1.41)  (-1.87)   \n",
       "FFC6    -0.02     0.28     0.03     0.29     0.08    -0.05     0.02    -0.10   \n",
       "      (-0.09)   (1.27)   (0.16)   (1.58)   (0.36)  (-0.25)   (0.10)  (-0.39)   \n",
       "HXZ     -0.08     0.25    -0.03     0.16    -0.04    -0.13    -0.11    -0.30   \n",
       "      (-0.25)   (0.90)  (-0.11)   (0.61)  (-0.14)  (-0.48)  (-0.42)  (-0.90)   \n",
       "HMXZ    -0.01     0.13     0.04    -0.02    -0.04     0.00     0.06    -0.33   \n",
       "      (-0.02)   (0.45)   (0.14)  (-0.08)  (-0.14)   (0.00)   (0.24)  (-1.08)   \n",
       "SY       0.07     0.29     0.12     0.23     0.13     0.00     0.10    -0.14   \n",
       "       (0.30)   (1.09)   (0.56)   (0.92)   (0.59)   (0.02)   (0.52)  (-0.56)   \n",
       "DHS      0.31     0.47     0.32     0.17     0.29     0.29     0.34     0.21   \n",
       "       (0.74)   (1.17)   (0.81)   (0.71)   (0.74)   (0.73)   (0.93)   (0.52)   \n",
       "\n",
       "       zscore    chars  \n",
       "Ret     -0.12    -0.23  \n",
       "      (-0.35)  (-0.62)  \n",
       "CAPM    -0.45    -0.65  \n",
       "      (-1.26)  (-1.90)  \n",
       "FF3     -0.63    -0.74  \n",
       "      (-2.53)  (-3.18)  \n",
       "FF5     -0.39    -0.39  \n",
       "      (-1.26)  (-1.36)  \n",
       "FFC6     0.02     0.01  \n",
       "       (0.07)   (0.03)  \n",
       "HXZ     -0.22    -0.05  \n",
       "      (-0.57)  (-0.17)  \n",
       "HMXZ    -0.25     0.02  \n",
       "      (-0.69)   (0.06)  \n",
       "SY      -0.02     0.10  \n",
       "      (-0.09)   (0.46)  \n",
       "DHS      0.23     0.33  \n",
       "       (0.56)   (0.80)  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Panel B of Table E.1\n",
    "### 2. Forecast Return\n",
    "idx = ['q1','q2','q3','y1','y2']\n",
    "col = abbr_list\n",
    "num_level = 5\n",
    "factor_dict = {'Ret': ['ones'],\n",
    "               'CAPM':['ones','Mkt_RF'],\n",
    "               'FF3': ['ones','Mkt_RF','SMB','HML'],\n",
    "               'FF5': ['ones','Mkt_RF','SMB', 'HML', 'RMW', 'CMA'],\n",
    "               'FFC6':['ones','Mkt_RF','SMB', 'HML', 'RMW', 'CMA','MOM'],\n",
    "               'HXZ':['ones','R_MKT','R_ME','R_IA','R_ROE'],\n",
    "               'HMXZ':['ones','R_MKT','R_ME','R_IA','R_ROE','R_EG'],\n",
    "               'SY':['ones','Mkt_RF','SMB_SY','MGMT', 'PERF'],\n",
    "               'DHS':['ones','Mkt_RF','PEAD', 'FIN'],\n",
    "               }\n",
    "\n",
    "rlts = []\n",
    "for c in col:\n",
    "    for i in idx:\n",
    "        df[f'{c}_Bias_{i}'] = (df[f'EPS_ana_{i}'] - df[f'RF_{i}_{c}'])/df['prc_l1']\n",
    "\n",
    "    # Average Bias\n",
    "    df[f'{c}_Bias_Avg'] = df[[f'{c}_Bias_q1',f'{c}_Bias_q2',f'{c}_Bias_q3',\n",
    "                              f'{c}_Bias_y1',f'{c}_Bias_y2']].mean(axis=1)\n",
    "\n",
    "    nonNA = (~df[[f'{c}_Bias_q1',f'{c}_Bias_q2',f'{c}_Bias_q3',\n",
    "                  f'{c}_Bias_y1',f'{c}_Bias_y2']].isna()).sum(axis=1)\n",
    "    df[f'{c}_Bias_Avg'] = np.where(nonNA > 1,\n",
    "                                   df[f'{c}_Bias_Avg'],\n",
    "                                   np.nan)\n",
    "\n",
    "    sort_var = f'{c}_Bias_Avg'\n",
    "    _,vwret1 = utils.SingleSort(df,'PERMNO', 'YearMonth',\n",
    "                                    sort_var, 'bh1m', num_level,\n",
    "                                    'ME', quantile_filter=None)\n",
    "    result = utils.SingleSort_RetAna(_,vwret1,'YearMonth',factor_data=all_factor,factor_dict=factor_dict,lag=12)\n",
    "    result = result['H-L']\n",
    "    result.name = c\n",
    "    rlts.append(result)\n",
    "    # break\n",
    "rlts = pd.concat(rlts,axis=1)\n",
    "rlts.to_clipboard()\n",
    "rlts"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Table F.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_tmp = pd.read_parquet('../data/Results/df_train_new.parquet')\n",
    "all_factor = pd.read_csv('../data/Other/ff5_factors_m.CSV')\n",
    "all_factor['YearMonth'] = pd.to_datetime(all_factor['yyyymm'], format='%Y%m') + MonthEnd(0)\n",
    "all_factor['YearMonth'] = all_factor['YearMonth'] + MonthEnd(-1)\n",
    "param_grid = ParameterGrid({'sample': [0.01, 0.05, 0.10],\n",
    "                            'feature': ['sqrt', 1.0],\n",
    "                            'window': [12, 24, 36, 48, 60],\n",
    "                            'std': ['zscore','raw', 'rank',], #\n",
    "                            'eps': [True, False],\n",
    "                            })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n",
      "Var:BE, Delete 91707 rows due to missing values, raw data 1370066 rows --> new data 1278359 rows\n"
     ]
    }
   ],
   "source": [
    "rlts = []\n",
    "for params in param_grid:\n",
    "\n",
    "    ## 1. Data: read in and construct average BE\n",
    "    file_path = f\"../data/Results/RF_variants_ALL/RF_sample_{params['sample']}_feature_{params['feature']}_window_{params['window']}_std_{params['std']}_eps_{params['eps']}.parquet\"\n",
    "    if os.path.exists(file_path):\n",
    "        forecast = pd.read_parquet(file_path)\n",
    "    else:\n",
    "        continue\n",
    "\n",
    "    df = df_tmp[['permno','YearMonth','bh1m','ME',\n",
    "                 'EPS_ana_q1','EPS_ana_q2','EPS_ana_q3',\n",
    "                 'EPS_ana_y1','EPS_ana_y2',\n",
    "                 'prc_l1']].merge(forecast, on=['permno','YearMonth'])\n",
    "\n",
    "    idx = ['q1','q2','q3','y1','y2']\n",
    "\n",
    "    for i in idx:\n",
    "        df[f'Bias_{i}'] = (df[f'EPS_ana_{i}'] - df[f'RF_{i}'])/df['prc_l1']\n",
    "\n",
    "    # Average Bias BE\n",
    "    df[f'BE'] = df[[f'Bias_q1',f'Bias_q2',f'Bias_q3',\n",
    "                    f'Bias_y1',f'Bias_y2']].mean(axis=1)\n",
    "\n",
    "    nonNA = (~df[[f'Bias_q1',f'Bias_q2',f'Bias_q3',\n",
    "                  f'Bias_y1',f'Bias_y2']].isna()).sum(axis=1)\n",
    "\n",
    "    df[f'BE'] = np.where(nonNA > 1,\n",
    "                         df[f'BE'],\n",
    "                         np.nan)\n",
    "\n",
    "    ## 2. Analysis: Single Sort and FF5 alpha\n",
    "    sort_var = f'BE'\n",
    "    num_level = 5\n",
    "    _,vwret1 = utils.SingleSort(df,'PERMNO', 'YearMonth',\n",
    "                                    sort_var, 'bh1m', num_level,\n",
    "                                    'ME', quantile_filter=None)\n",
    "    vwret1 = vwret1.merge(all_factor, left_index=True, right_on='YearMonth')\n",
    "\n",
    "    vwret1['ones'] = 1\n",
    "    mdl = sm.OLS(vwret1['H-L'], vwret1[['ones','Mkt_RF','SMB', 'HML', 'RMW', 'CMA']]).fit(cov_type = 'HAC', cov_kwds = {'maxlags':12})\n",
    "    params['FF5 alpha'] = mdl.params['ones']\n",
    "    params['FF5 t'] = mdl.tvalues['ones']\n",
    "\n",
    "    mdl = sm.OLS(vwret1['H-L'], vwret1[['ones','Mkt_RF','SMB', 'HML', 'RMW', 'CMA', 'MOM']]).fit(cov_type = 'HAC', cov_kwds = {'maxlags':12})\n",
    "    params['FFC6 alpha'] = mdl.params['ones']\n",
    "    params['FFC6 t'] = mdl.tvalues['ones']\n",
    "\n",
    "    rlts.append(params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>eps</th>\n",
       "      <th>feature</th>\n",
       "      <th>sample</th>\n",
       "      <th>std</th>\n",
       "      <th>window</th>\n",
       "      <th>FF5 alpha</th>\n",
       "      <th>FF5 t</th>\n",
       "      <th>FFC6 alpha</th>\n",
       "      <th>FFC6 t</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>T</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.10</td>\n",
       "      <td>rank</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.535254</td>\n",
       "      <td>-2.082017</td>\n",
       "      <td>-0.164156</td>\n",
       "      <td>-0.690010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>F</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.10</td>\n",
       "      <td>rank</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.535356</td>\n",
       "      <td>-1.993661</td>\n",
       "      <td>-0.146970</td>\n",
       "      <td>-0.573792</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>T</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.05</td>\n",
       "      <td>rank</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.494247</td>\n",
       "      <td>-1.866270</td>\n",
       "      <td>-0.108752</td>\n",
       "      <td>-0.456507</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>F</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.05</td>\n",
       "      <td>rank</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.492475</td>\n",
       "      <td>-1.865159</td>\n",
       "      <td>-0.097387</td>\n",
       "      <td>-0.391027</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>F</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.10</td>\n",
       "      <td>rank</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.444342</td>\n",
       "      <td>-1.744866</td>\n",
       "      <td>-0.127954</td>\n",
       "      <td>-0.525047</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>T</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.05</td>\n",
       "      <td>rank</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.422566</td>\n",
       "      <td>-1.710320</td>\n",
       "      <td>-0.114651</td>\n",
       "      <td>-0.478906</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>T</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.10</td>\n",
       "      <td>rank</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.415885</td>\n",
       "      <td>-1.655361</td>\n",
       "      <td>-0.116279</td>\n",
       "      <td>-0.473596</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>T</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.10</td>\n",
       "      <td>raw</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.433903</td>\n",
       "      <td>-1.539355</td>\n",
       "      <td>-0.040848</td>\n",
       "      <td>-0.173034</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>F</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.10</td>\n",
       "      <td>raw</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.413897</td>\n",
       "      <td>-1.484085</td>\n",
       "      <td>-0.028489</td>\n",
       "      <td>-0.132925</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>F</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.05</td>\n",
       "      <td>rank</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.379124</td>\n",
       "      <td>-1.481633</td>\n",
       "      <td>-0.053962</td>\n",
       "      <td>-0.217733</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>T</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.10</td>\n",
       "      <td>raw</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.393785</td>\n",
       "      <td>-1.462966</td>\n",
       "      <td>-0.012606</td>\n",
       "      <td>-0.059504</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>F</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.10</td>\n",
       "      <td>zscore</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.428472</td>\n",
       "      <td>-1.432119</td>\n",
       "      <td>-0.062960</td>\n",
       "      <td>-0.260605</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>F</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.05</td>\n",
       "      <td>raw</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.413856</td>\n",
       "      <td>-1.418407</td>\n",
       "      <td>-0.020245</td>\n",
       "      <td>-0.085771</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>T</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.05</td>\n",
       "      <td>raw</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.396169</td>\n",
       "      <td>-1.385562</td>\n",
       "      <td>-0.000314</td>\n",
       "      <td>-0.001290</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>F</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.05</td>\n",
       "      <td>raw</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.388459</td>\n",
       "      <td>-1.375758</td>\n",
       "      <td>-0.000295</td>\n",
       "      <td>-0.001314</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>F</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.10</td>\n",
       "      <td>zscore</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.427388</td>\n",
       "      <td>-1.361662</td>\n",
       "      <td>-0.029595</td>\n",
       "      <td>-0.116850</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>F</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.05</td>\n",
       "      <td>zscore</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.405464</td>\n",
       "      <td>-1.322234</td>\n",
       "      <td>-0.025025</td>\n",
       "      <td>-0.102326</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>T</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.05</td>\n",
       "      <td>raw</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.372198</td>\n",
       "      <td>-1.321757</td>\n",
       "      <td>0.018741</td>\n",
       "      <td>0.082344</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>F</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.10</td>\n",
       "      <td>raw</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.357356</td>\n",
       "      <td>-1.296935</td>\n",
       "      <td>0.033956</td>\n",
       "      <td>0.158650</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>T</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.05</td>\n",
       "      <td>zscore</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.378503</td>\n",
       "      <td>-1.268978</td>\n",
       "      <td>-0.019557</td>\n",
       "      <td>-0.077613</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>F</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.05</td>\n",
       "      <td>zscore</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.389144</td>\n",
       "      <td>-1.260622</td>\n",
       "      <td>0.018106</td>\n",
       "      <td>0.072732</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>T</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.10</td>\n",
       "      <td>zscore</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.401323</td>\n",
       "      <td>-1.215899</td>\n",
       "      <td>0.003935</td>\n",
       "      <td>0.014673</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>T</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.10</td>\n",
       "      <td>zscore</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.308330</td>\n",
       "      <td>-1.203317</td>\n",
       "      <td>-0.091804</td>\n",
       "      <td>-0.412803</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>T</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.10</td>\n",
       "      <td>zscore</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.354974</td>\n",
       "      <td>-1.176481</td>\n",
       "      <td>-0.010871</td>\n",
       "      <td>-0.042182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>T</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.05</td>\n",
       "      <td>zscore</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.364359</td>\n",
       "      <td>-1.155633</td>\n",
       "      <td>0.042878</td>\n",
       "      <td>0.169793</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>T</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.05</td>\n",
       "      <td>zscore</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.279352</td>\n",
       "      <td>-1.090407</td>\n",
       "      <td>-0.068884</td>\n",
       "      <td>-0.318226</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>T</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.10</td>\n",
       "      <td>rank</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.257663</td>\n",
       "      <td>-1.078414</td>\n",
       "      <td>-0.063909</td>\n",
       "      <td>-0.297702</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>T</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.05</td>\n",
       "      <td>rank</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.239232</td>\n",
       "      <td>-1.006223</td>\n",
       "      <td>-0.037932</td>\n",
       "      <td>-0.181571</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>T</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.10</td>\n",
       "      <td>rank</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.242769</td>\n",
       "      <td>-0.972822</td>\n",
       "      <td>-0.074482</td>\n",
       "      <td>-0.315472</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>T</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.10</td>\n",
       "      <td>zscore</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.252966</td>\n",
       "      <td>-0.958551</td>\n",
       "      <td>-0.066659</td>\n",
       "      <td>-0.288933</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>T</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.05</td>\n",
       "      <td>zscore</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.225708</td>\n",
       "      <td>-0.866227</td>\n",
       "      <td>-0.037706</td>\n",
       "      <td>-0.163109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>T</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.05</td>\n",
       "      <td>rank</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.204215</td>\n",
       "      <td>-0.821700</td>\n",
       "      <td>-0.031589</td>\n",
       "      <td>-0.135746</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>T</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.10</td>\n",
       "      <td>raw</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.161937</td>\n",
       "      <td>-0.761720</td>\n",
       "      <td>0.112176</td>\n",
       "      <td>0.618328</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>F</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.10</td>\n",
       "      <td>rank</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.143638</td>\n",
       "      <td>-0.578912</td>\n",
       "      <td>-0.032699</td>\n",
       "      <td>-0.140898</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>F</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.10</td>\n",
       "      <td>zscore</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.126283</td>\n",
       "      <td>-0.520683</td>\n",
       "      <td>-0.007580</td>\n",
       "      <td>-0.032701</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>F</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.10</td>\n",
       "      <td>rank</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.106944</td>\n",
       "      <td>-0.479095</td>\n",
       "      <td>0.055628</td>\n",
       "      <td>0.282385</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>F</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.05</td>\n",
       "      <td>zscore</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.099554</td>\n",
       "      <td>-0.426635</td>\n",
       "      <td>0.062004</td>\n",
       "      <td>0.304638</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>T</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.05</td>\n",
       "      <td>raw</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.089900</td>\n",
       "      <td>-0.401163</td>\n",
       "      <td>0.191153</td>\n",
       "      <td>1.036986</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>F</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.05</td>\n",
       "      <td>rank</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.097769</td>\n",
       "      <td>-0.399598</td>\n",
       "      <td>0.023509</td>\n",
       "      <td>0.102205</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>F</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.10</td>\n",
       "      <td>zscore</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.092032</td>\n",
       "      <td>-0.394722</td>\n",
       "      <td>0.081574</td>\n",
       "      <td>0.397835</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>F</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.05</td>\n",
       "      <td>zscore</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.093035</td>\n",
       "      <td>-0.377918</td>\n",
       "      <td>0.028447</td>\n",
       "      <td>0.120564</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>F</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.05</td>\n",
       "      <td>rank</td>\n",
       "      <td>12</td>\n",
       "      <td>-0.079964</td>\n",
       "      <td>-0.360679</td>\n",
       "      <td>0.075555</td>\n",
       "      <td>0.376689</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>T</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.10</td>\n",
       "      <td>raw</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.065971</td>\n",
       "      <td>-0.282916</td>\n",
       "      <td>0.213827</td>\n",
       "      <td>1.087477</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>T</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.05</td>\n",
       "      <td>raw</td>\n",
       "      <td>24</td>\n",
       "      <td>-0.040678</td>\n",
       "      <td>-0.183042</td>\n",
       "      <td>0.242280</td>\n",
       "      <td>1.249070</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>F</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.10</td>\n",
       "      <td>raw</td>\n",
       "      <td>24</td>\n",
       "      <td>0.066142</td>\n",
       "      <td>0.301697</td>\n",
       "      <td>0.292153</td>\n",
       "      <td>1.569475</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>F</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.05</td>\n",
       "      <td>raw</td>\n",
       "      <td>12</td>\n",
       "      <td>0.075733</td>\n",
       "      <td>0.356840</td>\n",
       "      <td>0.287681</td>\n",
       "      <td>1.578678</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>F</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.10</td>\n",
       "      <td>raw</td>\n",
       "      <td>12</td>\n",
       "      <td>0.081529</td>\n",
       "      <td>0.395578</td>\n",
       "      <td>0.308464</td>\n",
       "      <td>1.753501</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>F</td>\n",
       "      <td>sqrt</td>\n",
       "      <td>0.05</td>\n",
       "      <td>raw</td>\n",
       "      <td>24</td>\n",
       "      <td>0.087403</td>\n",
       "      <td>0.396033</td>\n",
       "      <td>0.309285</td>\n",
       "      <td>1.674049</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   eps feature  sample     std  window  FF5 alpha     FF5 t  FFC6 alpha  \\\n",
       "22   T     1.0    0.10    rank      12  -0.535254 -2.082017   -0.164156   \n",
       "46   F     1.0    0.10    rank      12  -0.535356 -1.993661   -0.146970   \n",
       "16   T     1.0    0.05    rank      12  -0.494247 -1.866270   -0.108752   \n",
       "40   F     1.0    0.05    rank      12  -0.492475 -1.865159   -0.097387   \n",
       "47   F     1.0    0.10    rank      24  -0.444342 -1.744866   -0.127954   \n",
       "17   T     1.0    0.05    rank      24  -0.422566 -1.710320   -0.114651   \n",
       "23   T     1.0    0.10    rank      24  -0.415885 -1.655361   -0.116279   \n",
       "20   T     1.0    0.10     raw      12  -0.433903 -1.539355   -0.040848   \n",
       "45   F     1.0    0.10     raw      24  -0.413897 -1.484085   -0.028489   \n",
       "41   F     1.0    0.05    rank      24  -0.379124 -1.481633   -0.053962   \n",
       "21   T     1.0    0.10     raw      24  -0.393785 -1.462966   -0.012606   \n",
       "43   F     1.0    0.10  zscore      24  -0.428472 -1.432119   -0.062960   \n",
       "38   F     1.0    0.05     raw      12  -0.413856 -1.418407   -0.020245   \n",
       "14   T     1.0    0.05     raw      12  -0.396169 -1.385562   -0.000314   \n",
       "39   F     1.0    0.05     raw      24  -0.388459 -1.375758   -0.000295   \n",
       "42   F     1.0    0.10  zscore      12  -0.427388 -1.361662   -0.029595   \n",
       "37   F     1.0    0.05  zscore      24  -0.405464 -1.322234   -0.025025   \n",
       "15   T     1.0    0.05     raw      24  -0.372198 -1.321757    0.018741   \n",
       "44   F     1.0    0.10     raw      12  -0.357356 -1.296935    0.033956   \n",
       "13   T     1.0    0.05  zscore      24  -0.378503 -1.268978   -0.019557   \n",
       "36   F     1.0    0.05  zscore      12  -0.389144 -1.260622    0.018106   \n",
       "18   T     1.0    0.10  zscore      12  -0.401323 -1.215899    0.003935   \n",
       "6    T    sqrt    0.10  zscore      12  -0.308330 -1.203317   -0.091804   \n",
       "19   T     1.0    0.10  zscore      24  -0.354974 -1.176481   -0.010871   \n",
       "12   T     1.0    0.05  zscore      12  -0.364359 -1.155633    0.042878   \n",
       "0    T    sqrt    0.05  zscore      12  -0.279352 -1.090407   -0.068884   \n",
       "10   T    sqrt    0.10    rank      12  -0.257663 -1.078414   -0.063909   \n",
       "4    T    sqrt    0.05    rank      12  -0.239232 -1.006223   -0.037932   \n",
       "11   T    sqrt    0.10    rank      24  -0.242769 -0.972822   -0.074482   \n",
       "7    T    sqrt    0.10  zscore      24  -0.252966 -0.958551   -0.066659   \n",
       "1    T    sqrt    0.05  zscore      24  -0.225708 -0.866227   -0.037706   \n",
       "5    T    sqrt    0.05    rank      24  -0.204215 -0.821700   -0.031589   \n",
       "8    T    sqrt    0.10     raw      12  -0.161937 -0.761720    0.112176   \n",
       "35   F    sqrt    0.10    rank      24  -0.143638 -0.578912   -0.032699   \n",
       "31   F    sqrt    0.10  zscore      24  -0.126283 -0.520683   -0.007580   \n",
       "34   F    sqrt    0.10    rank      12  -0.106944 -0.479095    0.055628   \n",
       "24   F    sqrt    0.05  zscore      12  -0.099554 -0.426635    0.062004   \n",
       "2    T    sqrt    0.05     raw      12  -0.089900 -0.401163    0.191153   \n",
       "29   F    sqrt    0.05    rank      24  -0.097769 -0.399598    0.023509   \n",
       "30   F    sqrt    0.10  zscore      12  -0.092032 -0.394722    0.081574   \n",
       "25   F    sqrt    0.05  zscore      24  -0.093035 -0.377918    0.028447   \n",
       "28   F    sqrt    0.05    rank      12  -0.079964 -0.360679    0.075555   \n",
       "9    T    sqrt    0.10     raw      24  -0.065971 -0.282916    0.213827   \n",
       "3    T    sqrt    0.05     raw      24  -0.040678 -0.183042    0.242280   \n",
       "33   F    sqrt    0.10     raw      24   0.066142  0.301697    0.292153   \n",
       "26   F    sqrt    0.05     raw      12   0.075733  0.356840    0.287681   \n",
       "32   F    sqrt    0.10     raw      12   0.081529  0.395578    0.308464   \n",
       "27   F    sqrt    0.05     raw      24   0.087403  0.396033    0.309285   \n",
       "\n",
       "      FFC6 t  \n",
       "22 -0.690010  \n",
       "46 -0.573792  \n",
       "16 -0.456507  \n",
       "40 -0.391027  \n",
       "47 -0.525047  \n",
       "17 -0.478906  \n",
       "23 -0.473596  \n",
       "20 -0.173034  \n",
       "45 -0.132925  \n",
       "41 -0.217733  \n",
       "21 -0.059504  \n",
       "43 -0.260605  \n",
       "38 -0.085771  \n",
       "14 -0.001290  \n",
       "39 -0.001314  \n",
       "42 -0.116850  \n",
       "37 -0.102326  \n",
       "15  0.082344  \n",
       "44  0.158650  \n",
       "13 -0.077613  \n",
       "36  0.072732  \n",
       "18  0.014673  \n",
       "6  -0.412803  \n",
       "19 -0.042182  \n",
       "12  0.169793  \n",
       "0  -0.318226  \n",
       "10 -0.297702  \n",
       "4  -0.181571  \n",
       "11 -0.315472  \n",
       "7  -0.288933  \n",
       "1  -0.163109  \n",
       "5  -0.135746  \n",
       "8   0.618328  \n",
       "35 -0.140898  \n",
       "31 -0.032701  \n",
       "34  0.282385  \n",
       "24  0.304638  \n",
       "2   1.036986  \n",
       "29  0.102205  \n",
       "30  0.397835  \n",
       "25  0.120564  \n",
       "28  0.376689  \n",
       "9   1.087477  \n",
       "3   1.249070  \n",
       "33  1.569475  \n",
       "26  1.578678  \n",
       "32  1.753501  \n",
       "27  1.674049  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rlts = pd.DataFrame(rlts)\n",
    "rlts.sort_values(by='FF5 t',ascending=True, inplace=True)\n",
    "rlts['eps'] = rlts['eps'].replace({True:'T',False:'F'})\n",
    "rlts[['sample','feature','window','std','eps','FF5 alpha','FF5 t','FFC6 alpha','FFC6 t']].to_clipboard()\n",
    "rlts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
